Retain only those counties that:
1. have over a certain amount of cases (2000 total in this case)
2. have signal data for at least a certain amount of days (50 in this case)
## Split CASES into lists by geo
geos_case = cases %>% group_keys(geo_value) %>% unlist()
case_list = cases %>% group_by(geo_value) %>% select(geo_value, time_value, value) %>% arrange(time_value) %>% group_split() %>% setNames(geos_case)
## Split SIGNAL into lists by geo
mask_list = masks %>% group_by(geo_value) %>% select(geo_value, time_value, value) %>% arrange(time_value) %>% group_split()
geos_masks = lapply(mask_list, function(a) a %>% select(geo_value) %>% unlist() %>% unique()) %>% unlist()
mask_list = mask_list %>% setNames(geos_masks)
## Combine and sort by overlapping geos
case_list = case_list[geos_masks]
# assert_that(all(names(case_list) == names(mask_list)))
## Retain only the geos averaging more than X cases a day
# X = sum / numdays
# X = 2000 / 91
# X = ~22
large_geos = names(case_list)[which(sapply(case_list, function(a) a %>% summarize(sum(value))) > 2000)]
case_list = case_list[large_geos]
mask_list = mask_list[large_geos]
large_geos = names(mask_list)[which(sapply(mask_list, function(a) a %>% summarize(nrow(a))) > 50)]
mask_list = mask_list[large_geos]
case_list = case_list[large_geos]
# assert_that(all(names(case_list) == names(mask_list)))
scale function.ksmooth function with a chosen bandwidth (12 in this case).estimate_deriv function (in this case using the ss param) to calculate local derivatives0.03 for cases and -0.0007 for masks)0.0007 in this case)The plots with the red dots are cases, and the dots mark the beginning of a rise.
The plots with blue dots are the signal and, in this case, the dots mark the beginning of a decline in mask usage.
The plots come in pairs of twos, with the case plot above the mask plot for each county.
bw = 12
deriv_1_threshold = .03
deriv_2_threshold = 0.0007
deriv_1_threshold_masks = -0.007
smooth_case_list = case_list
smooth_mask_list = mask_list
smooth_case_deriv_list = vector(mode = "list", length = length(case_list))
smooth_mask_deriv_list = vector(mode = "list", length = length(mask_list))
par(mfcol = c(2,3))
# par(mar=c(3,4,1,1))
# To find rise points
# TODO can be cleaner
get_rise_points = function(data) {
point = ifelse(data$first_deriv >= deriv_1_threshold,
ifelse(lag(data$first_deriv) < deriv_1_threshold,
ifelse(data$second_deriv > deriv_2_threshold,
ifelse(lead(data$first_deriv, 2) > deriv_1_threshold,
ifelse(lead(data$first_deriv, 4) > deriv_1_threshold,
ifelse(lead(data$first_deriv, 6) > deriv_1_threshold,
ifelse(lead(data$first_deriv, 8) > deriv_1_threshold,
ifelse(lead(data$first_deriv, 10) > deriv_1_threshold, TRUE, FALSE), FALSE), FALSE), FALSE), FALSE), FALSE), FALSE), FALSE)
}
# To find fall point
# TODO can be cleaner
get_fall_points = function(data) {
ifelse(data$first_deriv <= deriv_1_threshold_masks,
ifelse(lag(data$first_deriv) > deriv_1_threshold_masks,
ifelse(data$second_deriv < deriv_2_threshold,
ifelse(lead(data$first_deriv, 2) < deriv_1_threshold_masks,
ifelse(lead(data$first_deriv, 4) < deriv_1_threshold_masks,
ifelse(lead(data$first_deriv, 6) < deriv_1_threshold_masks,
ifelse(lead(data$first_deriv, 8) < deriv_1_threshold_masks, TRUE, FALSE), FALSE), FALSE), FALSE), FALSE), FALSE), FALSE)
}
for(ii in (1:length(case_list))){
# Smooth cases and signal
smooth_case_list[[ii]]$value = case_list[[ii]] %>% mutate(value=scale(value)) %>% pull(value) %>% sm(bw)
smooth_mask_list[[ii]]$value = mask_list[[ii]] %>% mutate(value=scale(value)) %>% pull(value) %>% sm(bw)
# Create cols with first derivs for cases and signal
smooth_case_deriv <- modeltools::estimate_deriv(smooth_case_list[[ii]], method = "ss", col_name="first_deriv", n=28)
smooth_mask_deriv <- modeltools::estimate_deriv(smooth_mask_list[[ii]], method = "ss", col_name = "first_deriv", n = 28)
# Create cols with second derivs
# TODO can clean up
smooth_case_deriv_2 <- smooth_case_deriv
smooth_case_deriv_2$value = smooth_case_deriv$first_deriv
smooth_case_deriv_2 = modeltools::estimate_deriv(smooth_case_deriv_2, method = "ss", col_name="second_deriv", n=28)
smooth_case_deriv = smooth_case_deriv %>% add_column(second_deriv = NA)
smooth_case_deriv$second_deriv = smooth_case_deriv_2$second_deriv
smooth_mask_deriv_2 <- smooth_mask_deriv
smooth_mask_deriv_2$value = smooth_mask_deriv$first_deriv
smooth_mask_deriv_2 = modeltools::estimate_deriv(smooth_mask_deriv_2, method = "ss", col_name="second_deriv", n=28)
smooth_mask_deriv = smooth_mask_deriv %>% add_column(second_deriv = NA)
smooth_mask_deriv$second_deriv = smooth_mask_deriv_2$second_deriv
# Mark rise points
smooth_case_deriv = smooth_case_deriv %>% add_column(rise_point = FALSE)
smooth_case_deriv$rise_point = get_rise_points(smooth_case_deriv)
smooth_mask_deriv = smooth_mask_deriv %>% add_column(fall_point = FALSE)
smooth_mask_deriv$fall_point = get_fall_points(smooth_mask_deriv)
# Save data for next step
smooth_case_deriv_list[[ii]] = smooth_case_deriv
smooth_mask_deriv_list[[ii]] = smooth_mask_deriv
# Plot it
if (ii >= 215 && ii < 230) {
county = smooth_case_deriv[[1, 'geo_value']]
smooth_case_deriv %$% plot(x=time_value, y=value, type="l", main = county)
smooth_case_deriv %>% filter(rise_point) %$% points(x=time_value, y=value, col="red", lwd=10)
county = smooth_mask_deriv[[1, 'geo_value']]
smooth_mask_deriv %$% plot(x=time_value, y=value, type="l", main = county)
smooth_mask_deriv %>% filter(fall_point) %$% points(x=time_value, y=value, col="blue", lwd=10)
}
}
Now that we have our method for determining points of interest, let’s look at where the signal and case points of interest occur in relation to each other for each county where there is at least one point of interest for the case and the signal.
We measure recall as number of counties where there is at least one signal point of interest that falls some number of days or fewer before a case point of interest divided by the number of counties where there is at least one case point of interest. When we observed a case rise, how many times did a point of interest in our signal precede it?
We measure precision as number of counties where there is at least one signal point of interest that falls some number of days or fewer before a case point of interest divived by the number of counties where there is at least one signal point of interest. When we observed a point of interest in our signal, how many times did it precede a rise in cases?
In this case the number of days we look at is 21 in the case of mask usage, if it has an impact on case incidence it probably takes a few weeks. This number (along with all the other thresholding numbers in this notebook) is up for discussion.
Also up for discussion is how we measure success: are we looking at the county level, or at the “rise point” level, where we perform the recall and precision calculations for all points of interest, rather than for each county? What counts as success on the county level? At least one match between case and signal (as done here), or all points matched?
# Narrow down counties to those that have a rise point for both cases and signal
success_count = 0
case_total_count = 0
mask_total_count = 0
for (i in (1:length(smooth_case_deriv_list))) {
case_points = TRUE %in% smooth_case_deriv_list[[i]]$rise_point
if (case_points) {case_total_count = case_total_count + 1}
mask_points = TRUE %in% smooth_mask_deriv_list[[i]]$fall_point
if(mask_points){mask_total_count = mask_total_count + 1}
if(case_points && mask_points) {
case_dates = na.omit(smooth_case_deriv_list[[i]]$time_value[smooth_case_deriv_list[[i]]$rise_point==TRUE])
mask_dates = na.omit(smooth_mask_deriv_list[[i]]$time_value[smooth_mask_deriv_list[[i]]$fall_point==TRUE])
# Check if any of the mask points are <= 21 days before any of the case points
for (j in (1:length(mask_dates))) {
for (k in (1:length(case_dates))) {
if(case_dates[k] - mask_dates[j] <= 21 && case_dates[k] - mask_dates[j] > 0) {
success_count = success_count + 1
}
}
}
}
}
Success count (‘mask drop’ <= 21 days before ‘case rise’):
success_count
## [1] 198
Total number of counties with at least one ‘case rise’:
case_total_count
## [1] 412
Total number of counties with at least one ‘mask drop’:
mask_total_count
## [1] 344
success_count / case_total_count
## [1] 0.4805825
success_count / mask_total_count
## [1] 0.5755814
Some salient examples. Here are some good examples that show where mask usage drops before cases rise:
par(mfcol = c(2,3))
# Plots a list of examples
plot_examples = function(examples) {
for (i in (1:length(smooth_case_deriv_list))) {
if (smooth_case_deriv_list[[i]]$geo_value[1] %in% examples) {
county = smooth_case_deriv_list[[i]]$geo_value[1]
smooth_case_deriv_list[[i]] %$% plot(x=time_value, y=value, type="l", main = county)
smooth_case_deriv_list[[i]] %>% filter(rise_point) %$% points(x=time_value, y=value, col="red", lwd=15)
smooth_mask_deriv_list[[i]] %$% plot(x=time_value, y=value, type="l", main = county)
smooth_mask_deriv_list[[i]] %>% filter(fall_point) %$% points(x=time_value, y=value, col="blue", lwd=15)
}
}
}
# Some good examples
good_examples = list("01101", "06059", "08059", "12015", "18057", "37119", "39025", "41005", "42011", "45083")
plot_examples(good_examples)
Here are some bad examples that show counter examples to this trend, and also instances where the algorithm to find the rise/drop points showed some weakness:
par(mfcol = c(2,3))
# Some bad examples
bad_examples = list("12001", "12011", "21067", "28121", "29095", "34003", "45063", "48201")
plot_examples(bad_examples)
Section 1: Approach 1 for selecting time periods
In this approach we identify peaks (30-day window), select for the preceding local minima (7-day window), then select 14-days before this minima. We will first examine if this approach selects for the intended periods.
suppressMessages(library(covidcast))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
case_counts = readRDS(file="../case_counts.RDS")
cli=readRDS(file="../cli_signal.RDS")
peaks <- function(x, halfWindowSize) {
#https://stackoverflow.com/questions/10892803/calculate-peak-values-in-a-plot-using-r
windowSize <- halfWindowSize * 2 + 1
windows <- embed(x, windowSize)
#max.col(windows, "first") finds the maximum position for each row of a matrix, where first indicates how to break ties
localMaxima <- max.col(windows, "first") == halfWindowSize + 1
return(c(rep(FALSE, halfWindowSize), localMaxima, rep(FALSE, halfWindowSize)))
}
rescale_vector = function(x){
(x - min(x))/(max(x) - min(x))
}
# Function to transform from one range to another
trans = function(x, from_range, to_range) {
(x - from_range[1]) / (from_range[2] - from_range[1]) *
(to_range[2] - to_range[1]) + to_range[1]
}
#taken from previous DAP (Ryan)
# Red, cyan (similar to ggplot defaults), then yellow
ggplot_colors = c("#00AFBB", "#FC4E07")
# Function to produce a plot comparing the signals for one county
df_in = case_counts
df_fb = cli
plot_one = function(geo_value, title = NULL, xlab = NULL,
ylab1 = NULL, ylab2 = NULL, legend = TRUE
) {
# Filter down the signal data frames
given_geo_value = geo_value
df_fb_one = df_fb %>% filter(geo_value == given_geo_value)
df_in_one = df_in %>% filter(geo_value == given_geo_value)
# Compute ranges of the two signals
range1 = df_in_one %>% select("value") %>% range
range2 = df_fb_one %>% select("value") %>% range
# Convenience functions for our two signal ranges
trans12 = function(x) trans(x, range1, range2)
trans21 = function(x) trans(x, range2, range1)
# Find state name, find abbreviation, then set title
state_name = fips_to_name(paste0(substr(geo_value, 1, 2), "000"))
state_abbr = name_to_abbr(state_name)
title = paste0(fips_to_name(geo_value), ", ", state_abbr)
# Transform the combined signal to the incidence range, then stack
# these rowwise into one data frame
df = select(rbind(df_fb_one %>% mutate_at("value", trans21),
df_in_one), c("time_value", "value"))
df$signal = c(rep("% CLI-in-community", nrow(df_fb_one)),
rep("New COVID-19 cases", nrow(df_in_one)))
#df = df[which(df$time_value >= min_time_value & df$time_value <= max_time_value),]
# Finally, plot both signals
pos = ifelse(legend, "bottom", "none")
return(ggplot(df, aes(x = time_value, y = value)) +
geom_line(aes(color = signal), size = 1.5) +
scale_color_manual(values = ggplot_colors[1:2]) +
scale_y_continuous(name = ylab1, limits = range1,
sec.axis = sec_axis(trans = trans12,
name = ylab2)) +
labs(title = title, x = xlab) + theme_bw() +
theme(legend.pos = pos, legend.title = element_blank(),
axis.text = element_text(size = 12),
legend.text = element_text(size = 12),
axis.title = element_text(size = 12),
title = element_text(size = 13)))
}
suppressMessages(library(magrittr))
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(pspline))
suppressMessages(library(lubridate))## Helper smoother function
sm <- function(y, bandwidth = 2){
n = length(y)
return(ksmooth(x = 1:(n-1), y = y, bandwidth = bandwidth, x.points = 1:n, kernel="normal")$y)
}
caselist=split(data.table::as.data.table(case_counts), by = "geo_value")
fblist=split(data.table::as.data.table(cli), by = "geo_value")
#santa clara 234, bronx ny 1863, miami dade county 372
example_ind = 372
fb_matching_ind = which(names(fblist) %in% names(caselist)[example_ind])
tt = 1:length(caselist[[example_ind]]$time_value)
fbtt = 1:length(fblist[[fb_matching_ind]]$time_value)
#myspline=splinefun(tt, sm(caselist[[2]]$value, 10))
case_signal_normalized = rescale_vector(caselist[[example_ind]]$value)
fb_signal_normalized = rescale_vector(fblist[[fb_matching_ind]]$value)
case_first_deriv = splinefun(tt, sm(case_signal_normalized, 10))(tt, 1)
case_second_deriv = splinefun(tt, sm(case_signal_normalized, 10))(tt, 2)
fb_first_deriv = splinefun(fbtt, sm(fb_signal_normalized, 10))(fbtt, 1)
fb_second_deriv = splinefun(fbtt, sm(fb_signal_normalized, 10))(fbtt, 2)
par(mfrow =c(4,2))
#Raw Signals Plotted
plot(caselist[[example_ind]]$time_value, caselist[[example_ind]]$value, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts"), col = 'red')
plot(fblist[[fb_matching_ind]]$time_value, fblist[[fb_matching_ind]]$value, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey"), col = 'cyan')
#Scaled Signals
plot(caselist[[example_ind]]$time_value, case_signal_normalized, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts (Scaled)"), col = 'red')
plot(fblist[[fb_matching_ind]]$time_value, fb_signal_normalized, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey (Scaled)"), col = 'cyan')
#Smoothed Signals (Scaled)
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts (Smoothed & Scaled)"), col = 'red')
plot(fblist[[fb_matching_ind]]$time_value, sm(fb_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey (Smoothed & Scaled)"), col = 'cyan')
#First Derivative
plot(caselist[[example_ind]]$time_value, case_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16,col = 'red', main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts (First Derivative & Scaled)"))
plot(fblist[[fb_matching_ind]]$time_value, fb_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16,col = 'cyan', main = paste0(caselist[[example_ind]]$geo_value[1], " FB Community Survey (First Derivative & Scaled)"))
Overlayed Plots Example
par(mfrow = c(2,1))
#Overlayed Signals
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts & FB Comm. Survey (Smoothed & Scaled)"), col = 'red',
ylim=c(0,1))
lines(fblist[[fb_matching_ind]]$time_value, sm(fb_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, col = 'cyan')
plot(caselist[[example_ind]]$time_value, case_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Case Counts & FB Comm. Survey (First Derivative & Scaled)"), col = 'red',
ylim=c(min(case_first_deriv, fb_first_deriv),max(case_first_deriv, fb_first_deriv)))
lines(fblist[[fb_matching_ind]]$time_value, fb_first_deriv, type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, col = 'cyan')
threshold = 0.20
par(mfrow = c(6,1))
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 1: First derivative > 0 and above 75th percentile"),
col = ifelse(case_first_deriv>quantile(case_first_deriv, 0.75) &
case_first_deriv>0 , 'black', 'red'), ylim=c(0,1))
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 2: Second Derivative < 0"), col = ifelse(case_second_deriv < 0, 'black', 'red'), ylim=c(0,1))
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 3: First Derivative >= 0 and Second Derivative Negative (Local Max)"), col = ifelse(
(case_first_deriv>= 0) &
case_second_deriv < 0, 'black', 'red'), ylim=c(0,1))
#plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, #main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 3b: Regions where signal is flat"), col = #ifelse(abs(case_first_deriv) <= quantile(abs(case_first_deriv), 0.5), 'black', 'red'), ylim=c(0,1))
flat_regions = which(abs(case_first_deriv) <= quantile(abs(case_first_deriv), 0.5))
s <- split(flat_regions, cumsum(c(TRUE, diff(flat_regions) != 1)))
s <- unlist(lapply(s, function(x) {
if(length(x) > 5)
x
}), use.names = F)
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 3c: Regions where signal is flat"), col = ifelse(1:length(caselist[[example_ind]]$time_value) %in% s, 'black', 'red'), ylim=c(0,1))
increasing_period= which(case_first_deriv>quantile(case_first_deriv, 0.75) & case_first_deriv>0)
s <- split(increasing_period, cumsum(c(TRUE, diff(increasing_period) != 1)))
s <- unlist(lapply(s, function(x){
if(sm(case_signal_normalized,10)[max(x)]/sm(case_signal_normalized,10)[min(x)] > (1+threshold))
{
c(x, (max(x)+1):(max(x)+5))
}
}), use.names = F)
#s <- unlist(lapply(s, function(x) { c(x, (max(x)+1):(max(x)+5))}), use.names = F)
plot(caselist[[example_ind]]$time_value, sm(case_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 4: First Derivative > 0 and above 75th percentile and include extra 5 days to the right (to capture local max) and \n has > 20% increase in cases"), col = ifelse(1:length(caselist[[example_ind]]$time_value) %in% s, 'black', 'red'), ylim=c(0,1))
increasing_period= which(fb_first_deriv>quantile(fb_first_deriv, 0.75) & fb_first_deriv>0)
s <- split(increasing_period, cumsum(c(TRUE, diff(increasing_period) != 1)))
s <- unlist(lapply(s, function(x){
if(sm(fb_signal_normalized,10)[max(x)]/sm(fb_signal_normalized,10)[min(x)] > (1+threshold))
{
c(x, (max(x)+1):(max(x)+5))
}
}), use.names = F)
plot(fblist[[fb_matching_ind]]$time_value, sm(fb_signal_normalized,10), type = 'b', lwd = 2, xlab = "Date", ylab = "Value",pch = 16, col = ifelse(1:length(fblist[[fb_matching_ind]]$time_value) %in% s, 'black', 'cyan'),
main = paste0(caselist[[example_ind]]$geo_value[1], " Selection Criteria 4 (FB Survey): First Derivative > 0 and above 75th percentile and include extra 5 days to the right (to capture local max) \n and has >20% increase in FB survey"), ylim=c(0,1))
Identifying periods of increases for all counties, calculating precision, recall
caselist=split(data.table::as.data.table(case_counts), by = "geo_value")
fblist=split(data.table::as.data.table(cli), by = "geo_value")
#needs to be put in a loop, this is for individual cases
num_significant_cases_increases = 0
num_positive_recall_cases=0
num_cases_remain_flat = 0
num_fb_increase_flat_case_regions=0
threshold = 0.2
processed_signals_list=vector("list", length(caselist))
precision_performance_list=vector("list", length(caselist))
recall_performance_list=vector("list", length(caselist))
names(processed_signals_list)<-names(caselist)
names(precision_performance_list)<-names(caselist)
names(recall_performance_list)<-names(caselist)
for(i in 1:length(caselist))
{
if(i %% 100 == 0)
cat(i, " of ", length(caselist), "\n")
if(any(caselist[[i]]$value < 0) | var(caselist[[i]]$value) == 0 | !(names(caselist)[i] %in% names(fblist)))
next
case_elem = caselist[[i]]
fb_elem = fblist[[which(names(fblist) %in% names(caselist)[i])]]
if(nrow(case_elem) < 30 | nrow(fb_elem) < 30)
next
colnames(fb_elem)[7] <- c("FB.Value")
merged_df= merge(x = case_elem, y = fb_elem, by = "time_value")
merged_df$value = rescale_vector(merged_df$value)
merged_df$FB.Value = rescale_vector(merged_df$FB.Value)
smoothed_county_cases = sm(merged_df$value, 10)
smoothed_fb_survey = sm(merged_df$FB.Value, 10)
tt = 1:length(merged_df$time_value)
case_first_deriv = splinefun(tt, smoothed_county_cases)(tt, 1)
case_second_deriv = splinefun(tt, smoothed_county_cases)(tt, 2)
fb_first_deriv = splinefun(tt, smoothed_fb_survey)(tt, 1)
fb_second_deriv = splinefun(tt, smoothed_fb_survey)(tt, 2)
regions_case_increases = which(case_first_deriv>quantile(case_first_deriv, 0.75) & case_first_deriv>0)
increasing_case_time_period_list <- split(regions_case_increases, cumsum(c(TRUE, diff(regions_case_increases) != 1)))
regions_fb_increases = which(fb_first_deriv>quantile(fb_first_deriv, 0.75) & fb_first_deriv>0)
increasing_fb_time_period_list <- split(regions_fb_increases, cumsum(c(TRUE, diff(regions_fb_increases) != 1)))
if(length(regions_case_increases) == 0 | length(regions_fb_increases) == 0)
next
increasing_case_time_period_list <- lapply(increasing_case_time_period_list, function(x){
if(smoothed_county_cases[max(x)]/smoothed_county_cases[min(x)] > (1+threshold) & smoothed_county_cases[min(x)] !=0)
{
c(x, (max(x)+1):(max(x)+5))
}
})
increasing_case_time_period_list = increasing_case_time_period_list[lengths(increasing_case_time_period_list)!=0]
num_significant_cases_increases = num_significant_cases_increases + length(increasing_case_time_period_list)
increasing_fb_time_period_list <- lapply(increasing_fb_time_period_list, function(x){
if(smoothed_fb_survey[max(x)]/smoothed_fb_survey[min(x)] > (1+threshold) & smoothed_fb_survey[min(x)]!=0)
{
c(x, (max(x)+1):(max(x)+5))
}
})
increasing_fb_time_period_list = increasing_fb_time_period_list[lengths(increasing_fb_time_period_list)!=0]
does_fb_increase_precede_case <- lapply(increasing_case_time_period_list, function(x){
beginning_case_increase = min(x)
beginning_fb_increase = unlist(lapply(increasing_fb_time_period_list, min), use.names = F)
1*any((beginning_case_increase - beginning_fb_increase > 0) & (beginning_case_increase - beginning_fb_increase <= 14))
})
num_positive_recall_cases = num_positive_recall_cases+sum(unlist(does_fb_increase_precede_case, use.names = F))
flat_case_regions = which(abs(case_first_deriv) <= quantile(abs(case_first_deriv), 0.5))
flat_case_time_period_list <- split(flat_regions, cumsum(c(TRUE, diff(flat_regions) != 1)))
flat_case_time_period_list <- lapply(flat_case_time_period_list, function(x) {
if(length(x) > 5)
{
x
}
})
flat_case_time_period_list= flat_case_time_period_list[lengths(flat_case_time_period_list)!=0]
num_cases_remain_flat = num_cases_remain_flat+length(flat_case_time_period_list)
does_fb_increase_flat_case <- lapply(flat_case_time_period_list, function(x){
beginning_case_increase = min(x)
beginning_fb_increase = unlist(lapply(increasing_fb_time_period_list, min), use.names = F)
1*any((beginning_case_increase - beginning_fb_increase > 0) & (beginning_case_increase - beginning_fb_increase <= 14))
})
num_fb_increase_flat_case_regions = num_fb_increase_flat_case_regions+sum(unlist(does_fb_increase_flat_case, use.names = F))
recall_performance_list[[i]]<-sum(unlist(does_fb_increase_precede_case, use.names = F))/length(increasing_case_time_period_list)
precision_performance_list[[i]]<-1-sum(unlist(does_fb_increase_flat_case, use.names = F))/length(flat_case_time_period_list)
merged_df$Case_Increasing_Regions = ifelse(1:nrow(merged_df) %in% unlist(increasing_case_time_period_list, use.names = F),1,0)
merged_df$FB_Increasing_Regions = ifelse(1:nrow(merged_df) %in% unlist(increasing_fb_time_period_list, use.names = F),1,0)
merged_df$Flat_Case_Regions = ifelse(1:nrow(merged_df) %in% unlist(flat_case_time_period_list, use.names = F),1,0)
merged_df$Flat_FB_Regions = ifelse(1:nrow(merged_df) %in% unlist(increasing_fb_time_period_list, use.names = F),1,0)
processed_signals_list[[i]] <- merged_df
}
## 100 of 3193
## 200 of 3193
## 300 of 3193
## 400 of 3193
## 500 of 3193
## 600 of 3193
## 700 of 3193
## 800 of 3193
## 900 of 3193
## 1000 of 3193
## 1100 of 3193
## 1200 of 3193
## 1300 of 3193
## 1400 of 3193
## 1500 of 3193
## 1600 of 3193
## 1700 of 3193
## 1800 of 3193
## 1900 of 3193
## 2000 of 3193
## 2100 of 3193
## 2200 of 3193
## 2300 of 3193
## 2400 of 3193
## 2500 of 3193
## 2600 of 3193
## 2700 of 3193
## 2800 of 3193
## 2900 of 3193
## 3000 of 3193
## 3100 of 3193
print(c(num_positive_recall_cases/num_significant_cases_increases, 1-num_fb_increase_flat_case_regions/num_cases_remain_flat))
## [1] 0.3609703 0.9014611
precision_vector=unlist(precision_performance_list, use.names = T)
recall_vector=unlist(recall_performance_list, use.names = T)
matching_counties=intersect(names(precision_vector), names(recall_vector))
performance_df = data.frame(County_Code = matching_counties,
Precision = precision_vector[match(matching_counties, names(precision_vector))],
Recall = recall_vector[match(matching_counties, names(recall_vector))],
stringsAsFactors = F)
performance_df$F1_score = 2*(performance_df$Precision * performance_df$Recall)/(performance_df$Precision + performance_df$Recall)
hist(performance_df$F1_score, xlab = "F1 Score", main = "Frequency Distribution of F1 Scores", col = "orange")
performance_df=performance_df[order(performance_df$F1_score, decreasing = T),]
print(performance_df[1:20,])
## County_Code Precision Recall F1_score
## 01015 01015 1 1 1
## 01045 01045 1 1 1
## 04003 04003 1 1 1
## 06029 06029 1 1 1
## 06095 06095 1 1 1
## 08001 08001 1 1 1
## 08031 08031 1 1 1
## 08043 08043 1 1 1
## 08101 08101 1 1 1
## 12023 12023 1 1 1
## 12075 12075 1 1 1
## 12083 12083 1 1 1
## 12087 12087 1 1 1
## 12089 12089 1 1 1
## 12131 12131 1 1 1
## 13029 13029 1 1 1
## 13045 13045 1 1 1
## 13223 13223 1 1 1
## 17073 17073 1 1 1
## 17179 17179 1 1 1
suppressMessages(library(patchwork))
plot_list=vector("list", 20)
for(i in 1:20)
{
county_name = performance_df$County_Code[i]
plot_list[[i]]<-plot_one(county_name, xlab = "Date",
ylab1 = "Daily new confirmed COVID-19 cases",
ylab2 = "% of people who know someone with CLI")
print(plot_list[[i]])
}